/* This is to clean the test score dataset */

/* Notes: download the test score files, create another excel file with two sheets
one named "grade10" and one named "grade12" and put in local disk 
* make sure to delete all test score related files in dropbox */

clear 
import excel using $testscoredata, sheet("grade10") firstrow

rename *, lower

* passing grades 
g passing_eng = ca1>=12 & th1>=28 if sub1=="ENG"
g passing_dzo = ca2>=12 & th2>=28 if sub2=="DZO"
replace passing_eng = . if missing(ca1) | missing(th1)
replace passing_dzo = . if missing(ca2) | missing(th2)

lab var passing_eng "1 = Pass BCSE English module"
lab var passing_dzo "1 = Pass BCSE Dzongkhag module"

* numeric grade 
g score_eng = tot1 if sub1=="ENG"
g score_dzo = tot2 if sub2=="DZO"

lab var score_eng "total score BCSE English module"
lab var score_dzo "total score BCSE Dzongkhag module"

// other subjects 
foreach sub in math phy che bio his geo {
    g ca`sub' = .
	g th`sub' = .
}

forval i=3/9 {
    foreach j in ca th {
    replace `j'math = `j'`i' if sub`i'=="MAT"
	replace `j'phy = `j'`i' if sub`i'=="PHY"
	replace `j'che = `j'`i' if sub`i'=="CHE"
	replace `j'bio = `j'`i' if sub`i'=="BIO"
	replace `j'his = `j'`i' if sub`i'=="HIS"
	replace `j'geo = `j'`i' if sub`i'=="GEO"
	}
}

foreach sub in math phy che bio his geo {
    g passing_`sub' = ca`sub'>=16 & th`sub'>=24 
	replace passing_`sub' = . if missing(ca`sub') | missing(th`sub')
	lab var passing_`sub' "1 = Pass BCSE `sub' module"
	g score_`sub' = ca`sub' + th`sub'
	lab var score_`sub' "total score BCSE `sub' module"
}

// letter grade 
foreach sub in eng dzo math phy che bio his geo {
g score_l`sub' = "A+" if score_`sub'>=91 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "A" if score_`sub'>=81 & score_`sub'<=90 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "B+" if score_`sub'>=71 & score_`sub'<=80 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "B" if score_`sub'>=61 & score_`sub'<=70 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "C+" if score_`sub'>=51 & score_`sub'<=60 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "C" if score_`sub'>=46 & score_`sub'<=50 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "D" if score_`sub'>=40 & score_`sub'<=45 & ~missing(score_`sub') & passing_`sub'
lab var score_l`sub' "letter grade of `sub' in BCSE"
}

g passing_bcse = result == "PASS CERTIFICATE AWARDED"
lab var passing_bcse "1 = BCSE Pass Certificate Awarded"

// meeting entry requirement to science stream if 
// passing math + average science >= 55 + each science > C
g average_science = (score_bio + score_phy + score_che)/3
g passing_science = passing_math & average_science>=55 & inlist(score_lphy, "A+", "A", "B+", "B", "C+", "C") & inlist(score_lche, "A+", "A", "B+", "B", "C+", "C") & inlist(score_lbio, "A+", "A", "B+", "B", "C+", "C") & passing_bcse 
lab var passing_science "1 = meet entry requirement for science stream"

rename cidnumber citizenid
replace citizenid = "" if length(citizenid)~=11
replace citizenid = "" if citizenid == "UNDER PROCE"

rename contactnumber phone 

g sex = gender=="M"
replace sex = 2 if gender=="F"

g name = studentname
replace name = trim(itrim(lower(name)))
replace name = subinstr(name," ","",.)  
replace name = subinstr(name,",","",.)   //Removes comma (,)
replace name = subinstr(name,"'","",.)   //Removes apostrophe (')
replace name = subinstr(name,".","",.)   //Removes dot (.) 
replace name = subinstr(name,"/","",.)   //Removes slash (/)
replace name = subinstr(name,"-","",.)   //Removes dash (-)
replace name = subinstr(name,"=","",.)   //Removes dash (-)
replace name = subinstr(name,"(","",.)   //Removes opening parentheses
replace name = subinstr(name,")","",.)   //Removes closing parentheses

replace schoolname = trim(itrim(lower(schoolname)))
replace dateofbirth = subinstr(dateofbirth,"-","",.)

g day = substr(dateofbirth, 7, 2)
g month = substr(dateofbirth, 5, 2)
g year = substr(dateofbirth, 1, 4)

g birthday = day + month + year

// some schools with variant names in the score database compared to ours
*/ make sure to correct, either way to match
replace schoolname = "yangchen gatshel middle secondary school" if schoolname == "yangchengatshel middle secondary school"
replace schoolname = "bajo higher secondary school" if schoolname == "bajothang higher secondary school"
replace schoolname = "baylling central school" if schoolname == "bayling central school"
replace schoolname = "betikha middle secondary school" if schoolname == "bitekha middle secondary school"
replace schoolname = "dechencholing higher secondary school" if schoolname == "dechhenchoeling higher secondary school"
replace schoolname = "elc high school" if schoolname == "educating for lifelong citizenship high school"
replace schoolname = "garpawoong middle secondary school" if schoolname == "garpawong middle secondary school"
replace schoolname = "gyelpozhing central school" if schoolname == "gyalpoizhing higher secondary school"
replace schoolname = "jomotsangkha middle secondary school" if schoolname == "jomotshangkha middle secondary school"
replace schoolname = "kuzhugchen middle secondary school" if schoolname == "kuzuchen middle secondary school"
replace schoolname = "martshala central	school" if schoolname == "martshalla central school"
replace schoolname = "pakshika central school" if schoolname == "pakshikha central school"
replace schoolname = "pelkhil higher secondary school" if schoolname == "pelkhil school"
replace schoolname = "radi middle secondary school" if schoolname == "radhi middle secondary school"
replace schoolname = "samdrupjongkhar middle secondary school" if schoolname == "samdrup jongkhar middle secondary school"
replace schoolname = "tashidingkha central school" if schoolname == "tashidingkha middle secondary school"
replace schoolname = "tashitse higher secondary	school" if schoolname == "trashitse higher secondary school"
replace schoolname = "uzorong central school" if schoolname == "udzorong central school"
replace schoolname = "zilukha middle secondary school" if schoolname == "zhilukha middle secondary school"

keep schoolname name phone sex citizenid score_* ca* th* indexnumber dateofbirth birthday passing_*
foreach i in schoolname name phone sex citizenid dateofbirth birthday {
    rename `i' `i'_s
}

save $temp/score10.dta, replace 


********************************************************************************
********************* Step 1: matched citizenid + fuzzy name 
********************************************************************************
gl criteria1 citizenid  

* open the score data 
use $temp/score10.dta, clear 
rename citizenid_s citizenid 

duplicates tag $criteria1, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/score10_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename citizenid citizenid_s
save $temp/score10_step1d, replace

* open the survey data 
use "$temp/stem_analysis.dta", clear
drop sex 
ren sex_bl sex

drop name
gen name = subinstr(name_bl," ","",.) // remove space 

g day = substr(dateofbirth, 7, 2)
g month = substr(dateofbirth, 5, 2)
g year = substr(dateofbirth, 1, 4)

g birthday = day + month + year

generate str cid_string = citizenid
replace citizenid = ""
compress citizenid
replace citizenid = cid_string
drop cid_string
describe citizenid

save $temp/data10, replace 

use $temp/data10, clear 

duplicates tag $criteria1, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data10_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data10_step1d, replace

* merging two unique data together using fully matched CID and fuzzy name
use $temp/score10_step1u
merge 1:1 $criteria1 using $temp/data10_step1u
matchit name_s name, g(namescore)
sum namescore if _merge==3

br name name_s if namescore<0.7 & _merge==3
* correct if names are not matched, leave for the next round of matching
replace _merge = . if inlist(citizenid, "10203000451", "10601004298", "10606002672", "11214002251", "11506005784", "11515000311", "11516003421", "11903000595", "12001004613")

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
save $temp/score10_step1m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score10.dta, keep(matched) nogen 
append using $temp/score10_step1d
save $temp/score10_step2b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data10.dta, keep(matched) nogen 
append using $temp/data10_step1d
save $temp/data10_step2b, replace 
restore 


********************************************************************************
********************************************************************************
// Step 2: matching schoolname + name + sex + fuzzy date of birth 
********************************************************************************
********************************************************************************

gl criteria2 schoolname_bl name sex 

* open the score dataset 
use $temp/score10_step2b, clear 
cap rename name_s name 
cap rename schoolname_s schoolname_bl 
cap rename sex_s sex 

duplicates tag $criteria2, g(temp)


* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/score10_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename name name_s 
cap rename schoolname_bl schoolname_s 
cap rename sex sex_s 
save $temp/score10_step2d, replace

* open the survey data 
use $temp/data10_step2b.dta, clear

duplicates tag $criteria2, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data10_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data10_step2d, replace

* merging two unique data together using criteria 2
clear
use $temp/score10_step2u
merge 1:1 $criteria2 using $temp/data10_step2u
matchit dateofbirth_s dateofbirth, g(birthscore)
matchit birthday_s birthday, g(birthdscore)

replace _merge = . if _merge==3 & ~(birthscore>0.8 | birthdscore>0.8) // allow 1-digit/unit mismatch in date of birth 

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
save $temp/score10_step2m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score10.dta, keep(matched) nogen 
append using $temp/score10_step2d
save $temp/score10_step3b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data10.dta, keep(matched) nogen 
append using $temp/data10_step2d
save $temp/data10_step3b, replace 
restore 

********************************************************************************
********************************************************************************
// Step 3: schoolname + sex + birth date + fuzzy name 
********************************************************************************
********************************************************************************
gl criteria3 schoolname_bl sex dateofbirth 

use $temp/score10_step3b, clear
rename schoolname_s schoolname_bl 
rename sex_s sex 
rename dateofbirth_s dateofbirth 

duplicates tag $criteria3, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/score10_step3u, replace 
restore 

keep if temp~=0 
drop temp
cap { 
rename schoolname_bl schoolname_s  
rename sex sex_s  
rename dateofbirth dateofbirth_s 
} 
save $temp/score10_step3d, replace 


use $temp/data10_step3b, clear
duplicates tag $criteria3, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/data10_step3u, replace 
restore 

keep if temp~=0 
drop temp 
save $temp/data10_step3d, replace

use $temp/score10_step3u, clear 
merge 1:1 $criteria3 using $temp/data10_step3u
matchit name name_s, g(namescore)

* manual check: accept all 
preserve 
keep if _merge==3 
keep uniqueid indexnumber 
save $temp/score10_step3m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score10.dta, keep(matched) nogen 
append using $temp/score10_step3d
save $temp/score10_step4b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data10.dta, keep(matched) nogen 
append using $temp/data10_step3d
save $temp/data10_step4b, replace 
restore 


********************************************************************************
********************************************************************************
// Step 4: sex + name + fuzzy phone (in case students transfered)
********************************************************************************
********************************************************************************
gl criteria4 sex name  

use $temp/score10_step4b, clear
rename sex_s sex 
rename name_s name 

duplicates tag $criteria4, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/score10_step4u, replace 
restore 

keep if temp~=0 
drop temp
cap {  
rename sex sex_s  
rename name name_s
} 
save $temp/score10_step4d, replace 


use $temp/data10_step4b, clear
duplicates tag $criteria4, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/data10_step4u, replace 
restore 

keep if temp~=0 
drop temp 
save $temp/data10_step4d, replace

use $temp/score10_step4u, clear 
merge 1:1 $criteria4 using $temp/data10_step4u
matchit phone_bl phone_s, g(phonescore)

replace _merge=. if phonescore<0.9 & _merge==3

* keep 
preserve 
keep if _merge==3 
keep uniqueid indexnumber
save $temp/score10_step4m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score10.dta, keep(matched) nogen 
append using $temp/score10_step4d
save $temp/score10_step5b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data10.dta, keep(matched) nogen 
append using $temp/data10_step4d
save $temp/data10_step5b, replace 
restore 

********************************************************************************
********************************************************************************
// Step 5: sex + phone + fuzzy name 
********************************************************************************
********************************************************************************
gl criteria5 sex phone_bl  

use $temp/score10_step5b, clear
rename sex_s sex 
rename phone_s phone_bl 

duplicates tag $criteria5, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/score10_step5u, replace 
restore 

keep if temp~=0 
drop temp
cap {  
rename sex sex_s  
rename phone_bl phone_s
} 
save $temp/score10_step5d, replace 


use $temp/data10_step5b, clear
duplicates tag $criteria5, g(temp)
tab temp 

preserve 
keep if temp==0
drop temp
save $temp/data10_step5u, replace 
restore 

keep if temp~=0 
drop temp 
save $temp/data10_step5d, replace

use $temp/score10_step5u, clear 
merge 1:1 $criteria5 using $temp/data10_step5u
matchit name name_s, g(namescore)

br name* namescore if _merge==3

replace _merge=. if indexnumber == "010211250034"

* keep 
preserve 
keep if _merge==3 
keep uniqueid indexnumber
save $temp/score10_step5m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score10.dta, keep(matched) nogen 
append using $temp/score10_step5d
save $temp/score10_step6b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data10.dta, keep(matched) nogen 
append using $temp/data10_step5d
save $temp/data10_step6b, replace 
restore 

* create a new anonymous score dataset to match with analysis dataset 
use $temp/score10_step1m, clear
g note = "criteria: perfect cid + fuzzy name"
append using $temp/score10_step2m
replace note = "criteria: school + name + sex + fuzzy birthday" if missing(note)
append using $temp/score10_step3m
replace note = "criteria: school + sex + birthday + fuzzy name" if missing(note)
append using $temp/score10_step4m
replace note = "criteria: sex + name + fuzzy phone" if missing(note)
append using $temp/score10_step5m
replace note = "criteria: sex + phone + fuzzy name" if missing(note)
count

merge 1:1 indexnumber using $temp/score10.dta, keep(matched) nogen 
drop *_s  
tab note
save "$clean/exam_score10.dta", replace 

* clear all temporary datasets 
global tempfilelist: dir "$temp/" files "score10*.dta"
	foreach tfile of global tempfilelist {
	erase "$temp/`tfile'"
}

global tempfilelist: dir "$temp/" files "data10*.dta"
	foreach tfile of global tempfilelist {
	erase "$temp/`tfile'"
}